import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import dalex as dx
# -- model
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
# -- pipeline
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelBinarizer
# -- model evaluation
from sklearn.metrics import accuracy_score, precision_score, recall_score, average_precision_score
from sklearn.metrics import roc_curve, auc, roc_auc_score, precision_recall_curve
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter
!pip install imbalanced-learn
Requirement already satisfied: imbalanced-learn in /opt/anaconda3/lib/python3.9/site-packages (0.12.0) Requirement already satisfied: numpy>=1.17.3 in /opt/anaconda3/lib/python3.9/site-packages (from imbalanced-learn) (1.26.4) Requirement already satisfied: scipy>=1.5.0 in /opt/anaconda3/lib/python3.9/site-packages (from imbalanced-learn) (1.12.0) Requirement already satisfied: scikit-learn>=1.0.2 in /opt/anaconda3/lib/python3.9/site-packages (from imbalanced-learn) (1.0.2) Requirement already satisfied: joblib>=1.1.1 in /opt/anaconda3/lib/python3.9/site-packages (from imbalanced-learn) (1.3.2) Requirement already satisfied: threadpoolctl>=2.0.0 in /opt/anaconda3/lib/python3.9/site-packages (from imbalanced-learn) (2.2.0)
loan = pd.read_csv('/Users/helenas/Desktop/Machine Learning/Project/Final_project_20240308/loan_train.csv')
loan.head()
| id | member_id | loan_amnt | funded_amnt | funded_amnt_inv | term | int_rate | installment | grade | sub_grade | ... | next_pymnt_d | last_credit_pull_d | collections_12_mths_ex_med | policy_code | application_type | acc_now_delinq | chargeoff_within_12_mths | delinq_amnt | pub_rec_bankruptcies | tax_liens | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1077501.0 | 1296599.0 | 5000.0 | 5000.0 | 4975.0 | 36 months | 10.65% | 162.87 | B | B2 | ... | NaN | Sep-2016 | 0.0 | 1.0 | INDIVIDUAL | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 1 | 1077430.0 | 1314167.0 | 2500.0 | 2500.0 | 2500.0 | 60 months | 15.27% | 59.83 | C | C4 | ... | NaN | Sep-2016 | 0.0 | 1.0 | INDIVIDUAL | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 2 | 1076863.0 | 1277178.0 | 10000.0 | 10000.0 | 10000.0 | 36 months | 13.49% | 339.31 | C | C1 | ... | NaN | Apr-2016 | 0.0 | 1.0 | INDIVIDUAL | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 3 | 1069639.0 | 1304742.0 | 7000.0 | 7000.0 | 7000.0 | 60 months | 15.96% | 170.08 | C | C5 | ... | NaN | Sep-2016 | 0.0 | 1.0 | INDIVIDUAL | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 4 | 1072053.0 | 1288686.0 | 3000.0 | 3000.0 | 3000.0 | 36 months | 18.64% | 109.43 | E | E1 | ... | NaN | Dec-2014 | 0.0 | 1.0 | INDIVIDUAL | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
5 rows × 52 columns
loan
| id | member_id | loan_amnt | funded_amnt | funded_amnt_inv | term | int_rate | installment | grade | sub_grade | ... | next_pymnt_d | last_credit_pull_d | collections_12_mths_ex_med | policy_code | application_type | acc_now_delinq | chargeoff_within_12_mths | delinq_amnt | pub_rec_bankruptcies | tax_liens | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1077501.0 | 1296599.0 | 5000.0 | 5000.0 | 4975.0 | 36 months | 10.65% | 162.87 | B | B2 | ... | NaN | Sep-2016 | 0.0 | 1.0 | INDIVIDUAL | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 1 | 1077430.0 | 1314167.0 | 2500.0 | 2500.0 | 2500.0 | 60 months | 15.27% | 59.83 | C | C4 | ... | NaN | Sep-2016 | 0.0 | 1.0 | INDIVIDUAL | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 2 | 1076863.0 | 1277178.0 | 10000.0 | 10000.0 | 10000.0 | 36 months | 13.49% | 339.31 | C | C1 | ... | NaN | Apr-2016 | 0.0 | 1.0 | INDIVIDUAL | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 3 | 1069639.0 | 1304742.0 | 7000.0 | 7000.0 | 7000.0 | 60 months | 15.96% | 170.08 | C | C5 | ... | NaN | Sep-2016 | 0.0 | 1.0 | INDIVIDUAL | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 4 | 1072053.0 | 1288686.0 | 3000.0 | 3000.0 | 3000.0 | 36 months | 18.64% | 109.43 | E | E1 | ... | NaN | Dec-2014 | 0.0 | 1.0 | INDIVIDUAL | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 29772 | 72998.0 | 72992.0 | 1000.0 | 1000.0 | 0.0 | 36 months | 9.64% | 32.11 | B | B4 | ... | Jul-2010 | Sep-2014 | NaN | 1.0 | INDIVIDUAL | NaN | NaN | NaN | NaN | NaN |
| 29773 | 72176.0 | 70868.0 | 2525.0 | 2525.0 | 225.0 | 36 months | 9.33% | 80.69 | B | B3 | ... | Jul-2010 | May-2007 | NaN | 1.0 | INDIVIDUAL | NaN | NaN | NaN | NaN | NaN |
| 29774 | 71623.0 | 70735.0 | 6500.0 | 6500.0 | 0.0 | 36 months | 8.38% | 204.84 | A | A5 | ... | Jul-2010 | Aug-2007 | NaN | 1.0 | INDIVIDUAL | NaN | NaN | NaN | NaN | NaN |
| 29775 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 29776 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
29777 rows × 52 columns
event_label_distribution = loan.loan_status.value_counts(normalize=True)
event_label_distribution
current 0.849649 default 0.150351 Name: loan_status, dtype: float64
plt.figure(figsize=(10, 6))
bars = plt.bar(event_label_distribution.index, event_label_distribution.values, color='skyblue')
for bar in bars:
yval = bar.get_height()
plt.text(bar.get_x() + bar.get_width()/2, yval, f'{yval:.2f}', ha='center', va='bottom') # Removed the '%' sign
plt.title('Distribution of default')
plt.xlabel('EVENT_LABEL')
plt.ylabel('Percentage')
# Get rid of grid lines
plt.grid(False)
# Remove outlines
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)
plt.gca().spines['left'].set_visible(False)
plt.show()
missing_pct = loan.isnull().sum().div(loan.shape[0]).apply(lambda x:format(x,'.2%')).reset_index()
missing_pct.columns = ['Variable Name', 'Missing Percentage']
missing_pct
| Variable Name | Missing Percentage | |
|---|---|---|
| 0 | id | 0.01% |
| 1 | member_id | 0.01% |
| 2 | loan_amnt | 0.01% |
| 3 | funded_amnt | 0.01% |
| 4 | funded_amnt_inv | 0.01% |
| 5 | term | 0.01% |
| 6 | int_rate | 0.01% |
| 7 | installment | 0.01% |
| 8 | grade | 0.01% |
| 9 | sub_grade | 0.01% |
| 10 | emp_title | 6.12% |
| 11 | emp_length | 2.56% |
| 12 | home_ownership | 0.01% |
| 13 | annual_inc | 0.01% |
| 14 | verification_status | 0.01% |
| 15 | issue_d | 0.01% |
| 16 | loan_status | 0.00% |
| 17 | pymnt_plan | 0.01% |
| 18 | url | 0.01% |
| 19 | desc | 31.68% |
| 20 | purpose | 0.01% |
| 21 | title | 0.05% |
| 22 | zip_code | 0.01% |
| 23 | addr_state | 0.01% |
| 24 | dti | 0.01% |
| 25 | delinq_2yrs | 0.08% |
| 26 | earliest_cr_line | 0.08% |
| 27 | fico_range_low | 0.01% |
| 28 | fico_range_high | 0.01% |
| 29 | inq_last_6mths | 0.08% |
| 30 | mths_since_last_delinq | 63.50% |
| 31 | mths_since_last_record | 91.37% |
| 32 | open_acc | 0.08% |
| 33 | pub_rec | 0.08% |
| 34 | revol_bal | 0.01% |
| 35 | revol_util | 0.23% |
| 36 | total_acc | 0.08% |
| 37 | out_prncp | 0.01% |
| 38 | out_prncp_inv | 0.01% |
| 39 | total_rec_late_fee | 0.01% |
| 40 | last_pymnt_d | 0.23% |
| 41 | last_pymnt_amnt | 0.01% |
| 42 | next_pymnt_d | 92.10% |
| 43 | last_credit_pull_d | 0.02% |
| 44 | collections_12_mths_ex_med | 0.35% |
| 45 | policy_code | 0.01% |
| 46 | application_type | 0.01% |
| 47 | acc_now_delinq | 0.08% |
| 48 | chargeoff_within_12_mths | 0.35% |
| 49 | delinq_amnt | 0.08% |
| 50 | pub_rec_bankruptcies | 3.24% |
| 51 | tax_liens | 0.27% |
loan.columns
Index(['id', 'member_id', 'loan_amnt', 'funded_amnt', 'funded_amnt_inv',
'term', 'int_rate', 'installment', 'grade', 'sub_grade', 'emp_title',
'emp_length', 'home_ownership', 'annual_inc', 'verification_status',
'issue_d', 'loan_status', 'pymnt_plan', 'url', 'desc', 'purpose',
'title', 'zip_code', 'addr_state', 'dti', 'delinq_2yrs',
'earliest_cr_line', 'fico_range_low', 'fico_range_high',
'inq_last_6mths', 'mths_since_last_delinq', 'mths_since_last_record',
'open_acc', 'pub_rec', 'revol_bal', 'revol_util', 'total_acc',
'out_prncp', 'out_prncp_inv', 'total_rec_late_fee', 'last_pymnt_d',
'last_pymnt_amnt', 'next_pymnt_d', 'last_credit_pull_d',
'collections_12_mths_ex_med', 'policy_code', 'application_type',
'acc_now_delinq', 'chargeoff_within_12_mths', 'delinq_amnt',
'pub_rec_bankruptcies', 'tax_liens'],
dtype='object')
cat = ['term', 'grade', 'sub_grade', 'emp_title', 'emp_length', 'home_ownership',
'purpose', 'zip_code', 'addr_state', 'verification_status']
num = ['loan_amnt', 'funded_amnt', 'funded_amnt_inv', 'installment', 'annual_inc',
'dti', 'fico_range_low', 'fico_range_high', 'inq_last_6mths', 'open_acc', 'pub_rec',
'revol_bal', 'total_acc', 'pub_rec_bankruptcies']
rate_cat_to_num = ['int_rate', 'revol_util']
date_cat_to_num = ['issue_d', 'earliest_cr_line', 'last_credit_pull_d']
# Transform rates into numeric
loan['int_rate'] = loan['int_rate'].str.strip('%').astype('float64')
loan['revol_util'] = loan['revol_util'].str.strip('%').astype('float64')
# Calculate the count of months from each date up to Dec-2023
def count_month(feature):
dates_datetime = pd.to_datetime(loan[feature], format='%b-%Y')
end_date = pd.Timestamp('Dec-2023')
loan[feature] = ((end_date.year - dates_datetime.dt.year) * 12 + (end_date.month - dates_datetime.dt.month))
for i in date_cat_to_num:
count_month(i)
numeric_features = num + rate_cat_to_num + date_cat_to_num
categorical_features = cat
print('Number of numeric features:', len(numeric_features))
print(numeric_features)
print('Number of categorical features:', len(categorical_features))
print(categorical_features)
Number of numeric features: 19 ['loan_amnt', 'funded_amnt', 'funded_amnt_inv', 'installment', 'annual_inc', 'dti', 'fico_range_low', 'fico_range_high', 'inq_last_6mths', 'open_acc', 'pub_rec', 'revol_bal', 'total_acc', 'pub_rec_bankruptcies', 'int_rate', 'revol_util', 'issue_d', 'earliest_cr_line', 'last_credit_pull_d'] Number of categorical features: 10 ['term', 'grade', 'sub_grade', 'emp_title', 'emp_length', 'home_ownership', 'purpose', 'zip_code', 'addr_state', 'verification_status']
# Need to use a previous version of pandas to match seaborn
# !pip uninstall pandas
# !pip install pandas==1.5.2
print(pd.__version__)
1.5.2
# Setting the aesthetic style of the plots
sns.set_style("whitegrid")
for col in numeric_features:
plt.figure(figsize=(8, 4))
sns.histplot(data=loan, x=col, hue='loan_status', kde=True, bins=30)
plt.title(f'Distribution of {col}')
plt.xlabel(col)
plt.ylabel('Frequency')
plt.show()
# Compute the correlation matrix
corr = loan[numeric_features].corr()
# Generate a heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(corr, annot=True, fmt=".2f", cmap='coolwarm', cbar=True, square=True)
plt.title('Correlation Matrix of Numeric Features')
plt.show()
loan[numeric_features].corr()
| loan_amnt | funded_amnt | funded_amnt_inv | installment | annual_inc | dti | fico_range_low | fico_range_high | inq_last_6mths | open_acc | pub_rec | revol_bal | total_acc | pub_rec_bankruptcies | int_rate | revol_util | issue_d | earliest_cr_line | last_credit_pull_d | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| loan_amnt | 1.000000 | 0.981859 | 0.928723 | 0.931040 | 0.266371 | 0.065216 | 0.129247 | 0.129247 | -0.030824 | 0.181247 | -0.052624 | 0.251019 | 0.258210 | -0.037727 | 0.292174 | 0.066071 | -0.156024 | 0.191375 | -0.082409 |
| funded_amnt | 0.981859 | 1.000000 | 0.946322 | 0.956712 | 0.262227 | 0.064302 | 0.121071 | 0.121071 | -0.030344 | 0.178910 | -0.053025 | 0.246531 | 0.252556 | -0.038008 | 0.295130 | 0.069582 | -0.162045 | 0.184086 | -0.083706 |
| funded_amnt_inv | 0.928723 | 0.946322 | 1.000000 | 0.888913 | 0.241750 | 0.069469 | 0.141781 | 0.141781 | -0.078271 | 0.159299 | -0.056516 | 0.203765 | 0.240382 | -0.045950 | 0.278899 | 0.071051 | -0.323538 | 0.167630 | -0.135345 |
| installment | 0.931040 | 0.956712 | 0.888913 | 1.000000 | 0.266363 | 0.055694 | 0.058972 | 0.058972 | -0.010530 | 0.181188 | -0.045178 | 0.263561 | 0.236886 | -0.032186 | 0.272285 | 0.094331 | -0.078656 | 0.167946 | -0.038663 |
| annual_inc | 0.266371 | 0.262227 | 0.241750 | 0.266363 | 1.000000 | -0.110412 | 0.049251 | 0.049251 | 0.025374 | 0.160347 | -0.017614 | 0.274796 | 0.237292 | -0.016130 | 0.052965 | 0.019063 | -0.016405 | 0.181183 | 0.007781 |
| dti | 0.065216 | 0.064302 | 0.069469 | 0.055694 | -0.110412 | 1.000000 | -0.189247 | -0.189247 | 0.013047 | 0.299711 | -0.001398 | 0.187671 | 0.243310 | 0.010976 | 0.117192 | 0.278250 | -0.074106 | 0.055170 | -0.119582 |
| fico_range_low | 0.129247 | 0.121071 | 0.141781 | 0.058972 | 0.049251 | -0.189247 | 1.000000 | 1.000000 | -0.141032 | -0.030234 | -0.155576 | -0.025977 | 0.107180 | -0.133766 | -0.703473 | -0.544699 | -0.103232 | 0.237170 | 0.118787 |
| fico_range_high | 0.129247 | 0.121071 | 0.141781 | 0.058972 | 0.049251 | -0.189247 | 1.000000 | 1.000000 | -0.141032 | -0.030234 | -0.155576 | -0.025977 | 0.107180 | -0.133766 | -0.703473 | -0.544699 | -0.103232 | 0.237170 | 0.118787 |
| inq_last_6mths | -0.030824 | -0.030344 | -0.078271 | -0.010530 | 0.025374 | 0.013047 | -0.141032 | -0.141032 | 1.000000 | 0.097117 | 0.058491 | 0.013200 | 0.093722 | 0.043389 | 0.180887 | -0.041999 | 0.221717 | -0.005715 | 0.016138 |
| open_acc | 0.181247 | 0.178910 | 0.159299 | 0.181188 | 0.160347 | 0.299711 | -0.030234 | -0.030234 | 0.097117 | 1.000000 | 0.007975 | 0.259627 | 0.695284 | 0.012916 | 0.032538 | -0.075815 | 0.001768 | 0.236720 | -0.049497 |
| pub_rec | -0.052624 | -0.053025 | -0.056516 | -0.045178 | -0.017614 | -0.001398 | -0.155576 | -0.155576 | 0.058491 | 0.007975 | 1.000000 | -0.049576 | -0.014522 | 0.837373 | 0.101209 | 0.059303 | 0.032802 | 0.053310 | -0.041521 |
| revol_bal | 0.251019 | 0.246531 | 0.203765 | 0.263561 | 0.274796 | 0.187671 | -0.025977 | -0.025977 | 0.013200 | 0.259627 | -0.049576 | 1.000000 | 0.273481 | -0.042661 | 0.083661 | 0.228160 | 0.057318 | 0.229548 | -0.016127 |
| total_acc | 0.258210 | 0.252556 | 0.240382 | 0.236886 | 0.237292 | 0.243310 | 0.107180 | 0.107180 | 0.093722 | 0.695284 | -0.014522 | 0.273481 | 1.000000 | -0.001055 | -0.028642 | -0.058854 | -0.051623 | 0.386206 | -0.050321 |
| pub_rec_bankruptcies | -0.037727 | -0.038008 | -0.045950 | -0.032186 | -0.016130 | 0.010976 | -0.133766 | -0.133766 | 0.043389 | 0.012916 | 0.837373 | -0.042661 | -0.001055 | 1.000000 | 0.085994 | 0.061630 | 0.025503 | 0.058086 | -0.041230 |
| int_rate | 0.292174 | 0.295130 | 0.278899 | 0.272285 | 0.052965 | 0.117192 | -0.703473 | -0.703473 | 0.180887 | 0.032538 | 0.101209 | 0.083661 | -0.028642 | 0.085994 | 1.000000 | 0.457664 | -0.010112 | -0.115177 | -0.147004 |
| revol_util | 0.066071 | 0.069582 | 0.071051 | 0.094331 | 0.019063 | 0.278250 | -0.544699 | -0.544699 | -0.041999 | -0.075815 | 0.059303 | 0.228160 | -0.058854 | 0.061630 | 0.457664 | 1.000000 | -0.054309 | -0.044681 | -0.153284 |
| issue_d | -0.156024 | -0.162045 | -0.323538 | -0.078656 | -0.016405 | -0.074106 | -0.103232 | -0.103232 | 0.221717 | 0.001768 | 0.032802 | 0.057318 | -0.051623 | 0.025503 | -0.010112 | -0.054309 | 1.000000 | 0.031255 | 0.342261 |
| earliest_cr_line | 0.191375 | 0.184086 | 0.167630 | 0.167946 | 0.181183 | 0.055170 | 0.237170 | 0.237170 | -0.005715 | 0.236720 | 0.053310 | 0.229548 | 0.386206 | 0.058086 | -0.115177 | -0.044681 | 0.031255 | 1.000000 | 0.006728 |
| last_credit_pull_d | -0.082409 | -0.083706 | -0.135345 | -0.038663 | 0.007781 | -0.119582 | 0.118787 | 0.118787 | 0.016138 | -0.049497 | -0.041521 | -0.016127 | -0.050321 | -0.041230 | -0.147004 | -0.153284 | 0.342261 | 0.006728 | 1.000000 |
High correlation:
categorical_features_eda = [
'term',
'grade',
'sub_grade',
'emp_length',
'home_ownership',
'purpose',
'addr_state',
'verification_status']
# Total transaction count and default count for each categorical variable
filter_mask = loan["loan_status"] == 'default'
for feature in categorical_features_eda:
gb = loan[feature].value_counts().reset_index()
gb.columns = [feature, 'total_count']
print(gb)
plt.figure(figsize = (8,6))
sns.barplot(data = gb,
x = 'total_count',
y = feature,
color = "skyblue")
# Get rid of grid lines
plt.grid(False)
# Remove outlines
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)
plt.gca().spines['bottom'].set_visible(False)
plt.xlabel("Number of total loans")
plt.ylabel(feature)
plt.title(f"Number of total loans by {feature}")
plt.show()
gbdefault = loan[filter_mask][feature].value_counts().reset_index()
gbdefault.columns = [feature, 'fraud_count']
print(gbdefault)
#####
plt.figure(figsize = (8,6))
sns.barplot(data = gbdefault,
x = 'fraud_count',
y = feature,
color = "skyblue")
# Get rid of grid lines
plt.grid(False)
# Remove outlines
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)
plt.gca().spines['bottom'].set_visible(False)
plt.xlabel("Number of default loans")
plt.ylabel(feature)
plt.title(f"Number of default loans by {feature}")
plt.show()
print('---'*24)
term total_count 0 36 months 22160 1 60 months 7614
term fraud_count 0 36 months 2704 1 60 months 1773
------------------------------------------------------------------------ grade total_count 0 B 8620 1 A 7142 2 C 6068 3 D 4268 4 E 2391 5 F 909 6 G 376
grade fraud_count 0 B 1030 1 C 1020 2 D 951 3 E 618 4 A 433 5 F 298 6 G 127
------------------------------------------------------------------------ sub_grade total_count 0 B3 2088 1 A4 2044 2 A5 1957 3 B5 1932 4 B4 1774 5 C1 1601 6 B2 1486 7 C2 1461 8 B1 1340 9 A3 1244 10 C3 1178 11 A2 1109 12 D2 1072 13 C4 940 14 D3 922 15 C5 888 16 D4 806 17 A1 788 18 D1 748 19 D5 720 20 E1 609 21 E2 560 22 E3 475 23 E4 391 24 E5 356 25 F1 274 26 F2 210 27 F3 176 28 F4 147 29 F5 102 30 G1 102 31 G4 76 32 G2 75 33 G5 63 34 G3 60
sub_grade fraud_count 0 B5 267 1 C1 253 2 B4 237 3 C2 236 4 B3 231 5 D2 223 6 D3 215 7 C3 215 8 D4 188 9 D5 184 10 B2 161 11 A5 161 12 E1 160 13 C4 158 14 C5 158 15 E2 144 16 D1 141 17 B1 134 18 A4 126 19 E4 110 20 E3 109 21 E5 95 22 F1 84 23 A3 67 24 F2 60 25 A2 58 26 F4 54 27 F5 51 28 F3 49 29 G1 31 30 G2 30 31 G4 23 32 G5 22 33 A1 21 34 G3 21
------------------------------------------------------------------------ emp_length total_count 0 10+ years 6577 1 < 1 year 3491 2 2 years 3313 3 3 years 3008 4 4 years 2563 5 1 year 2537 6 5 years 2441 7 6 years 1681 8 7 years 1305 9 8 years 1126 10 9 years 973
emp_length fraud_count 0 10+ years 1042 1 < 1 year 544 2 2 years 456 3 3 years 417 4 1 year 385 5 5 years 367 6 4 years 363 7 6 years 245 8 7 years 199 9 8 years 159 10 9 years 132
------------------------------------------------------------------------ home_ownership total_count 0 RENT 14064 1 MORTGAGE 13340 2 OWN 2275 3 OTHER 91 4 NONE 4
home_ownership fraud_count 0 RENT 2211 1 MORTGAGE 1906 2 OWN 344 3 OTHER 15 4 NONE 1
------------------------------------------------------------------------
purpose total_count
0 debt_consolidation 13816
1 credit_card 3850
2 other 3108
3 home_improvement 2226
4 major_purchase 1643
5 small_business 1387
6 car 1118
7 wedding 704
8 medical 528
9 moving 448
10 house 313
11 educational 279
12 vacation 277
13 renewable_energy 77
purpose fraud_count 0 debt_consolidation 2144 1 other 512 2 credit_card 430 3 small_business 377 4 home_improvement 302 5 major_purchase 180 6 car 119 7 medical 93 8 moving 79 9 wedding 71 10 educational 60 11 house 55 12 vacation 42 13 renewable_energy 13
------------------------------------------------------------------------ addr_state total_count 0 CA 5188 1 NY 2836 2 FL 2200 3 TX 2067 4 NJ 1408 5 IL 1174 6 PA 1128 7 GA 1058 8 VA 1056 9 MA 1012 10 OH 912 11 MD 773 12 AZ 667 13 WA 617 14 NC 603 15 CT 575 16 CO 564 17 MI 551 18 MO 549 19 MN 449 20 NV 375 21 WI 361 22 SC 350 23 AL 339 24 OR 332 25 LA 312 26 KY 265 27 KS 212 28 OK 208 29 AR 192 30 UT 185 31 DC 164 32 NM 140 33 RI 133 34 WV 131 35 NH 128 36 HI 121 37 DE 95 38 WY 61 39 MT 60 40 AK 57 41 SD 48 42 VT 43 43 MS 19 44 IN 15 45 TN 14 46 IA 10 47 NE 9 48 ID 6 49 ME 2
addr_state fraud_count 0 CA 893 1 FL 400 2 NY 370 3 TX 262 4 NJ 225 5 GA 167 6 IL 166 7 PA 161 8 MA 134 9 VA 130 10 MD 125 11 OH 111 12 AZ 104 13 MO 100 14 WA 96 15 NC 91 16 NV 82 17 MI 82 18 CT 76 19 CO 66 20 MN 56 21 WI 55 22 OR 55 23 SC 52 24 KY 47 25 AL 39 26 LA 35 27 UT 35 28 OK 31 29 NM 28 30 KS 26 31 AR 22 32 HI 20 33 NH 19 34 WV 18 35 RI 15 36 DE 13 37 AK 11 38 DC 10 39 SD 10 40 MT 10 41 IN 6 42 VT 5 43 MS 5 44 NE 5 45 TN 3 46 WY 3 47 ID 1 48 IA 1
------------------------------------------------------------------------ verification_status total_count 0 Not Verified 13128 1 Verified 9460 2 Source Verified 7186
verification_status fraud_count 0 Not Verified 1855 1 Verified 1555 2 Source Verified 1067
------------------------------------------------------------------------
# Fraud rate for each categorical variable
for feature in categorical_features_eda:
gb1 = loan[filter_mask][feature].value_counts().reset_index()
gb2 = loan[feature].value_counts().reset_index()
gb1.columns = [feature, 'fraud_count']
gb2.columns = [feature, 'total_count']
gb = pd.merge(gb1, gb2, on = feature, how = 'right')
gb['fraud_rate(%)'] = gb['fraud_count'] / gb['total_count'] * 100
gb = gb.sort_values(by = 'fraud_rate(%)', ascending = False)
print(gb[[feature, 'fraud_rate(%)']])
plt.figure(figsize = (8,6))
sns.barplot(data=gb,
x='fraud_rate(%)',
y=feature,
color = 'skyblue')
# Get rid of grid lines
plt.grid(False)
# Remove outlines
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)
plt.gca().spines['bottom'].set_visible(False)
plt.xlabel('Default Rate (%)')
plt.ylabel(feature)
plt.title(f"Default Rate by {feature}")
plt.show()
print('---'*24)
term fraud_rate(%) 1 60 months 23.286052 0 36 months 12.202166
------------------------------------------------------------------------ grade fraud_rate(%) 6 G 33.776596 5 F 32.783278 4 E 25.846926 3 D 22.282099 2 C 16.809492 0 B 11.948956 1 A 6.062728
------------------------------------------------------------------------ sub_grade fraud_rate(%) 29 F5 50.000000 32 G2 40.000000 28 F4 36.734694 34 G3 35.000000 33 G5 34.920635 25 F1 30.656934 30 G1 30.392157 31 G4 30.263158 26 F2 28.571429 23 E4 28.132992 27 F3 27.840909 24 E5 26.685393 20 E1 26.272578 21 E2 25.714286 19 D5 25.555556 16 D4 23.325062 14 D3 23.318872 22 E3 22.947368 12 D2 20.802239 18 D1 18.850267 10 C3 18.251273 15 C5 17.792793 13 C4 16.808511 7 C2 16.153320 5 C1 15.802623 3 B5 13.819876 4 B4 13.359639 0 B3 11.063218 6 B2 10.834455 8 B1 10.000000 2 A5 8.226878 1 A4 6.164384 9 A3 5.385852 11 A2 5.229937 17 A1 2.664975
------------------------------------------------------------------------ emp_length fraud_rate(%) 0 10+ years 15.843090 1 < 1 year 15.582928 8 7 years 15.249042 5 1 year 15.175404 6 5 years 15.034822 7 6 years 14.574658 4 4 years 14.163090 9 8 years 14.120782 3 3 years 13.863032 2 2 years 13.763960 10 9 years 13.566290
------------------------------------------------------------------------ home_ownership fraud_rate(%) 4 NONE 25.000000 3 OTHER 16.483516 0 RENT 15.720990 2 OWN 15.120879 1 MORTGAGE 14.287856
------------------------------------------------------------------------
purpose fraud_rate(%)
5 small_business 27.180966
11 educational 21.505376
9 moving 17.633929
8 medical 17.613636
10 house 17.571885
13 renewable_energy 16.883117
2 other 16.473616
0 debt_consolidation 15.518240
12 vacation 15.162455
3 home_improvement 13.566936
1 credit_card 11.168831
4 major_purchase 10.955569
6 car 10.644007
7 wedding 10.085227
------------------------------------------------------------------------ addr_state fraud_rate(%) 47 NE 55.555556 44 IN 40.000000 43 MS 26.315789 20 NV 21.866667 45 TN 21.428571 41 SD 20.833333 32 NM 20.000000 40 AK 19.298246 30 UT 18.918919 18 MO 18.214936 2 FL 18.181818 26 KY 17.735849 0 CA 17.212799 39 MT 16.666667 48 ID 16.666667 24 OR 16.566265 36 HI 16.528926 11 MD 16.170763 4 NJ 15.980114 7 GA 15.784499 12 AZ 15.592204 13 WA 15.559157 21 WI 15.235457 14 NC 15.091211 28 OK 14.903846 17 MI 14.882033 22 SC 14.857143 35 NH 14.843750 6 PA 14.273050 5 IL 14.139693 34 WV 13.740458 37 DE 13.684211 9 MA 13.241107 15 CT 13.217391 1 NY 13.046544 3 TX 12.675375 19 MN 12.472160 8 VA 12.310606 27 KS 12.264151 10 OH 12.171053 16 CO 11.702128 42 VT 11.627907 23 AL 11.504425 29 AR 11.458333 33 RI 11.278195 25 LA 11.217949 46 IA 10.000000 31 DC 6.097561 38 WY 4.918033 49 ME NaN
------------------------------------------------------------------------ verification_status fraud_rate(%) 1 Verified 16.437632 2 Source Verified 14.848316 0 Not Verified 14.130104
------------------------------------------------------------------------
target = 'loan_status'
# We only include useful numeric and categorical features
numeric_features = ['installment', 'annual_inc', 'dti', 'fico_range_low', 'inq_last_6mths',
'open_acc', 'pub_rec', 'revol_bal', 'total_acc', 'pub_rec_bankruptcies',
'int_rate', 'revol_util', 'issue_d', 'earliest_cr_line', 'last_credit_pull_d']
categorical_features = ['term', 'grade', 'sub_grade', 'emp_title', 'emp_length',
'home_ownership', 'purpose', 'zip_code', 'addr_state', 'verification_status']
# Define the target variable
y = loan['loan_status'].map({'current': 0, 'default': 1}) # Convert to binary
X = loan.drop('loan_status', axis=1)
X_train, X_test, y_train, y_test = train_test_split(loan[numeric_features + categorical_features], y, test_size=0.2, random_state=42)
def plot_class_distribution(y, title='Class distribution'):
counter = Counter(y)
plt.figure(figsize=(6, 3), dpi=100)
plt.bar(counter.keys(), counter.values(), color='skyblue')
plt.title(title)
plt.ylabel('Frequency')
plt.xlabel('Class')
plt.xticks(list(counter.keys()))
# Remove gridlines
plt.grid(False)
plt.show()
plot_class_distribution(y_train, 'Class distribution in Training Set')
# Use under-sampling to handle unbalanced data
rus = RandomUnderSampler(random_state=42, sampling_strategy=0.5)
X_train, y_train = rus.fit_resample(X_train, y_train)
plt.figure(figsize=(6, 3), dpi = 100)
plt.grid(False)
plot_class_distribution(y_train, 'Class distribution after Random Under-sampling 50%')
# Define transformer for preproccessing
numeric_transformer = Pipeline(steps=[
('imputer', SimpleImputer(strategy='median')),
('scaler', StandardScaler())
])
categorical_transformer = Pipeline(steps=[
('imputer', SimpleImputer(strategy='constant', fill_value='missing')), # Put missing value in category 'missing'
('onehot', OneHotEncoder(handle_unknown='ignore'))
])
preprocessor = ColumnTransformer(
transformers=[
('num', numeric_transformer, numeric_features),
('cat', categorical_transformer, categorical_features)
])
# Define evaluation function 'model_evaluation'
def model_evaluation(pipeline):
# Predictions for the training set
y_train_pred = pipeline.predict(X_train)
y_train_prob = pipeline.predict_proba(X_train)[:, 1]
# Predictions for the test set
y_test_pred = pipeline.predict(X_test)
y_test_prob = pipeline.predict_proba(X_test)[:, 1]
# Binarize labels for AUC calculation
lb = LabelBinarizer()
y_train_binarized = lb.fit_transform(y_train).ravel()
y_test_binarized = lb.transform(y_test).ravel()
# Calculating metrics
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)
train_precision = precision_score(y_train, y_train_pred, )
test_precision = precision_score(y_test, y_test_pred, )
train_recall = recall_score(y_train, y_train_pred, )
test_recall = recall_score(y_test, y_test_pred, )
train_f1 = f1_score(y_train, y_train_pred, )
test_f1 = f1_score(y_test, y_test_pred, )
train_auc = roc_auc_score(y_train_binarized, y_train_prob)
test_auc = roc_auc_score(y_test_binarized, y_test_prob)
# Print Metrics
print("Training Metrics:")
print(f"Accuracy: {train_accuracy:.4f}")
print(f"Precision (default): {train_precision:.4f}")
print(f"Recall (default): {train_recall:.4f}")
print(f"F1 Score (default): {train_f1:.4f}")
print(f"AUC: {train_auc:.4f}")
print("\nTest Metrics:")
print(f"Accuracy: {test_accuracy:.4f}")
print(f"Precision (default): {test_precision:.4f}")
print(f"Recall (default): {test_recall:.4f}")
print(f"F1 Score (default): {test_f1:.4f}")
print(f"AUC: {test_auc:.4f}")
# Fit baseline Logistic Regression Model
# Define the Logistic Regression pipeline
lr_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
('classifier', LogisticRegression(max_iter = 300,
random_state = 42))])
# Train the Logistic Regression model
lr_pipeline.fit(X_train, y_train)
# Predict and evaluate the model
y_pred_lr = lr_pipeline.predict(X_test)
y_pred_proba_lr = lr_pipeline.predict_proba(X_test)[:,1]
# Evaluation Metrics
accuracy_lr = accuracy_score(y_test, y_pred_lr)
auc_lr = roc_auc_score(y_test, y_pred_proba_lr)
precision_lr = precision_score(y_test, y_pred_lr)
recall_lr = recall_score(y_test, y_pred_lr)
# Print the metrics
print("Baseline Logistic Regression Model Evaluation:")
print(f" Accuracy: {accuracy_lr:.4f}")
print(f" AUC: {auc_lr:.4f}")
print(f" Precision: {precision_lr:.4f}")
print(f" Recall: {recall_lr:.4f}")
Baseline Logistic Regression Model Evaluation: Accuracy: 0.8104 AUC: 0.8160 Precision: 0.4017 Recall: 0.5755
# Tuning
# Create the parameter grid
param_grid = {
'classifier__max_iter': [300],
'classifier__penalty': ['l2', 'elasticnet'],
'classifier__C': [0.01, 0.1, 1]
}
# Instantiate the GridSearchCV object
grid_search = GridSearchCV(lr_pipeline, param_grid, cv = 3, n_jobs = -1, scoring = 'roc_auc', verbose = 1)
# Fit the GridSearchCV object to the training data
grid_search.fit(X_train, y_train)
# Get the best set of hyperparameters
best_params = grid_search.best_params_
# Print the best set of hyperparameters
print("Best parameters:")
for key, value in best_params.items():
print(f" {key}: {value}")
Fitting 3 folds for each of 6 candidates, totalling 18 fits Best parameters: classifier__C: 0.01 classifier__max_iter: 300 classifier__penalty: l2
# Fit the model with the best parameters in Gridsearch
lr_pipeline_hpo = Pipeline(steps=[('preprocessor', preprocessor),
('classifier', LogisticRegression(penalty = 'l2',
C = 0.1,
max_iter = 300,
random_state = 42))])
# Train the Logistic Regression model
lr_pipeline_hpo.fit(X_train, y_train)
Pipeline(steps=[('preprocessor',
ColumnTransformer(transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='median')),
('scaler',
StandardScaler())]),
['installment', 'annual_inc',
'dti', 'fico_range_low',
'inq_last_6mths', 'open_acc',
'pub_rec', 'revol_bal',
'total_acc',
'pub_rec_bankruptcies',
'int_rate', 'revol_util',
'issue_d',
'earliest_cr_line',
'last_credit_pull_d']),
('cat',
Pipeline(steps=[('imputer',
SimpleImputer(fill_value='missing',
strategy='constant')),
('onehot',
OneHotEncoder(handle_unknown='ignore'))]),
['term', 'grade', 'sub_grade',
'emp_title', 'emp_length',
'home_ownership', 'purpose',
'zip_code', 'addr_state',
'verification_status'])])),
('classifier',
LogisticRegression(C=0.1, max_iter=300, random_state=42))])
model_evaluation(lr_pipeline_hpo)
Training Metrics: Accuracy: 0.8097 Precision (default): 0.7344 Recall (default): 0.6721 F1 Score (default): 0.7019 AUC: 0.8694 Test Metrics: Accuracy: 0.8222 Precision (default): 0.4288 Recall (default): 0.6084 F1 Score (default): 0.5031 AUC: 0.8292
# Fit baseline Random Forest Model
# Define the Random Forest pipeline
rf_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
('classifier', RandomForestClassifier(n_estimators = 50,
min_samples_split = 10,
n_jobs = -1,
random_state = 42))])
# Train the Random Forest model
rf_pipeline.fit(X_train, y_train)
# Predict and evaluate the model
y_pred_rf = rf_pipeline.predict(X_test)
y_pred_proba_rf = rf_pipeline.predict_proba(X_test)[:,1]
# Evaluation Metrics
accuracy_rf = accuracy_score(y_test, y_pred_rf)
auc_rf = roc_auc_score(y_test, y_pred_proba_rf)
precision_rf = precision_score(y_test, y_pred_rf)
recall_rf = recall_score(y_test, y_pred_rf)
# Print the metrics
print("Baseline Random Forest Model Evaluation:")
print(f" Accuracy: {accuracy_rf:.4f}")
print(f" AUC: {auc_rf:.4f}")
print(f" Precision: {precision_rf:.4f}")
print(f" Recall: {recall_rf:.4f}")
Baseline Random Forest Model Evaluation: Accuracy: 0.8546 AUC: 0.8661 Precision: 0.5093 Recall: 0.4642
# Tuning
# Create the parameter grid
param_grid = {
'classifier__n_estimators': [20, 30, 50],
#'classifier__max_depth': [5, 10],
'classifier__min_samples_split': [5, 10],
#'classifier__min_samples_leaf': [1, 2, 4]
}
# Instantiate the GridSearchCV object
random_search = RandomizedSearchCV(rf_pipeline, param_grid, cv = 3, n_jobs = -1, scoring = 'roc_auc', verbose = 1)
# Fit the GridSearchCV object to the training data
random_search.fit(X_train, y_train)
# Get the best set of hyperparameters
best_params = random_search.best_params_
# Print the best set of hyperparameters
print("Best parameters:")
for key, value in best_params.items():
print(f" {key}: {value}")
Fitting 3 folds for each of 6 candidates, totalling 18 fits Best parameters: classifier__n_estimators: 50 classifier__min_samples_split: 10
# Fit the model with the best parameters in Gridsearch
rf_pipeline_hpo = Pipeline(steps=[('preprocessor', preprocessor),
('classifier', RandomForestClassifier(n_estimators=50,
min_samples_split=10,
#min_samples_leaf=2,
#max_depth=10,
n_jobs=-1,
random_state=42))])
# Train the pipeline
rf_pipeline_hpo.fit(X_train, y_train)
Pipeline(steps=[('preprocessor',
ColumnTransformer(transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='median')),
('scaler',
StandardScaler())]),
['installment', 'annual_inc',
'dti', 'fico_range_low',
'inq_last_6mths', 'open_acc',
'pub_rec', 'revol_bal',
'total_acc',
'pub_rec_bankruptcies',
'int_rate', 'revol_util',
'issue_d',
'earliest_cr_line'...
Pipeline(steps=[('imputer',
SimpleImputer(fill_value='missing',
strategy='constant')),
('onehot',
OneHotEncoder(handle_unknown='ignore'))]),
['term', 'grade', 'sub_grade',
'emp_title', 'emp_length',
'home_ownership', 'purpose',
'zip_code', 'addr_state',
'verification_status'])])),
('classifier',
RandomForestClassifier(min_samples_split=10, n_estimators=50,
n_jobs=-1, random_state=42))])
model_evaluation(rf_pipeline_hpo)
Training Metrics: Accuracy: 0.9803 Precision (default): 0.9982 Recall (default): 0.9427 F1 Score (default): 0.9697 AUC: 0.9992 Test Metrics: Accuracy: 0.8546 Precision (default): 0.5093 Recall (default): 0.4642 F1 Score (default): 0.4857 AUC: 0.8661
# Fit baseline GBM Model
# Initialize the pipeline with the preprocessor and a Gradient Boosting classifier
gbm_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
('classifier', GradientBoostingClassifier(learning_rate=0.1,
n_estimators=100,
random_state=42
))])
# Train the pipeline
gbm_pipeline.fit(X_train, y_train)
# Predict and evaluate the model
y_pred_gbm = gbm_pipeline.predict(X_test)
y_pred_proba_gbm = gbm_pipeline.predict_proba(X_test)[:,1]
# Evaluation Metrics
accuracy_gbm = accuracy_score(y_test, y_pred_gbm)
auc_gbm = roc_auc_score(y_test, y_pred_proba_gbm)
precision_gbm = precision_score(y_test, y_pred_gbm)
recall_gbm = recall_score(y_test, y_pred_gbm)
# Print the metrics
print("Baseline GBM Model Evaluation:")
print(f" Accuracy: {accuracy_gbm:.4f}")
print(f" AUC: {auc_gbm:.4f}")
print(f" Precision: {precision_gbm:.4f}")
print(f" Recall: {recall_gbm:.4f}")
Baseline GBM Model Evaluation: Accuracy: 0.8217 AUC: 0.8937 Precision: 0.4424 Recall: 0.7889
# Tuning
# Create the parameter grid
param_grid = {
'classifier__n_estimators': [100,200],
'classifier__learning_rate': [0.05, 0.1, 0.2],
}
# Instantiate the GridSearchCV object
grid_search = GridSearchCV(gbm_pipeline, param_grid, cv=3, n_jobs=-1, scoring='roc_auc', verbose=1)
# Fit the GridSearchCV object to the training data
grid_search.fit(X_train, y_train)
# Get the best set of hyperparameters
best_params = grid_search.best_params_
# Print the best set of hyperparameters
print("Best parameters:")
for key, value in best_params.items():
print(f" {key}: {value}")
Fitting 3 folds for each of 6 candidates, totalling 18 fits Best parameters: classifier__learning_rate: 0.1 classifier__n_estimators: 200
# Fit the model with the best parameters in Gridsearch
gbm_pipeline_hpo = Pipeline(steps=[('preprocessor', preprocessor),
('classifier', GradientBoostingClassifier(learning_rate=0.1,
n_estimators=200,
random_state=42
))])
# Train the pipeline
gbm_pipeline_hpo.fit(X_train, y_train)
Pipeline(steps=[('preprocessor',
ColumnTransformer(transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='median')),
('scaler',
StandardScaler())]),
['installment', 'annual_inc',
'dti', 'fico_range_low',
'inq_last_6mths', 'open_acc',
'pub_rec', 'revol_bal',
'total_acc',
'pub_rec_bankruptcies',
'int_rate', 'revol_util',
'issue_d',
'earliest_cr_line'...
('cat',
Pipeline(steps=[('imputer',
SimpleImputer(fill_value='missing',
strategy='constant')),
('onehot',
OneHotEncoder(handle_unknown='ignore'))]),
['term', 'grade', 'sub_grade',
'emp_title', 'emp_length',
'home_ownership', 'purpose',
'zip_code', 'addr_state',
'verification_status'])])),
('classifier',
GradientBoostingClassifier(n_estimators=200,
random_state=42))])
model_evaluation(gbm_pipeline_hpo)
Training Metrics: Accuracy: 0.8403 Precision (default): 0.7271 Recall (default): 0.8337 F1 Score (default): 0.7768 AUC: 0.9286 Test Metrics: Accuracy: 0.8209 Precision (default): 0.4404 Recall (default): 0.7798 F1 Score (default): 0.5629 AUC: 0.8936
# Fit baseline MLP Model
# Initialize the pipeline with the preprocessor and a MLP classifier
mlp_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
('classifier', MLPClassifier(hidden_layer_sizes=(20,10,),
activation='relu',
solver='adam',
max_iter=300,
random_state=42
))])
# Train the pipeline
mlp_pipeline.fit(X_train, y_train)
# Predict and evaluate the model
y_pred_mlp = mlp_pipeline.predict(X_test)
y_pred_proba_mlp = mlp_pipeline.predict_proba(X_test)[:,1]
# Evaluation Metrics
accuracy_mlp = accuracy_score(y_test, y_pred_mlp)
auc_mlp = roc_auc_score(y_test, y_pred_proba_mlp)
precision_mlp = precision_score(y_test, y_pred_mlp)
recall_mlp = recall_score(y_test, y_pred_mlp)
# Print the metrics
print("Baseline MLP Model Evaluation:")
print(f" Accuracy: {accuracy_mlp:.4f}")
print(f" AUC: {auc_mlp:.4f}")
print(f" Precision: {precision_mlp:.4f}")
print(f" Recall: {recall_mlp:.4f}")
Baseline MLP Model Evaluation: Accuracy: 0.7648 AUC: 0.7804 Precision: 0.3327 Recall: 0.5868
# Tuning
# Create the parameter grid
param_grid = {
'mlp__hidden_layer_sizes': [(20, 10), (30, 20), (50,)],
'mlp__activation': ['relu', 'tanh'],
'mlp__solver': ['adam', 'sgd'],
'mlp__max_iter': [300, 500],
}
# Instantiate the GridSearchCV object
random_search = RandomizedSearchCV(mlp_pipeline, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
# Fit the GridSearchCV object to the training data
random_search.fit(X_train, y_train)
# Get the best set of hyperparameters
best_params = random_search.best_params_
# Print the best set of hyperparameters
print("Best Parameters: ", best_params)
--------------------------------------------------------------------------- _RemoteTraceback Traceback (most recent call last) _RemoteTraceback: """ Traceback (most recent call last): File "/opt/anaconda3/lib/python3.9/site-packages/joblib/externals/loky/process_executor.py", line 463, in _process_worker r = call_item() File "/opt/anaconda3/lib/python3.9/site-packages/joblib/externals/loky/process_executor.py", line 291, in __call__ return self.fn(*self.args, **self.kwargs) File "/opt/anaconda3/lib/python3.9/site-packages/joblib/parallel.py", line 589, in __call__ return [func(*args, **kwargs) File "/opt/anaconda3/lib/python3.9/site-packages/joblib/parallel.py", line 589, in <listcomp> return [func(*args, **kwargs) File "/opt/anaconda3/lib/python3.9/site-packages/sklearn/utils/fixes.py", line 216, in __call__ return self.function(*args, **kwargs) File "/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 668, in _fit_and_score estimator = estimator.set_params(**cloned_parameters) File "/opt/anaconda3/lib/python3.9/site-packages/sklearn/pipeline.py", line 188, in set_params self._set_params("steps", **kwargs) File "/opt/anaconda3/lib/python3.9/site-packages/sklearn/utils/metaestimators.py", line 54, in _set_params super().set_params(**params) File "/opt/anaconda3/lib/python3.9/site-packages/sklearn/base.py", line 245, in set_params raise ValueError( ValueError: Invalid parameter mlp for estimator Pipeline(steps=[('preprocessor', ColumnTransformer(transformers=[('num', Pipeline(steps=[('imputer', SimpleImputer(strategy='median')), ('scaler', StandardScaler())]), ['installment', 'annual_inc', 'dti', 'fico_range_low', 'inq_last_6mths', 'open_acc', 'pub_rec', 'revol_bal', 'total_acc', 'pub_rec_bankruptcies', 'int_rate', 'revol_util', 'issue_d', 'earliest_cr_line'... Pipeline(steps=[('imputer', SimpleImputer(fill_value='missing', strategy='constant')), ('onehot', OneHotEncoder(handle_unknown='ignore'))]), ['term', 'grade', 'sub_grade', 'emp_title', 'emp_length', 'home_ownership', 'purpose', 'zip_code', 'addr_state', 'verification_status'])])), ('classifier', MLPClassifier(hidden_layer_sizes=(20, 10), max_iter=300, random_state=42))]). Check the list of available parameters with `estimator.get_params().keys()`. """ The above exception was the direct cause of the following exception: ValueError Traceback (most recent call last) Input In [58], in <cell line: 14>() 11 random_search = RandomizedSearchCV(mlp_pipeline, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1) 13 # Fit the GridSearchCV object to the training data ---> 14 random_search.fit(X_train, y_train) 16 # Get the best set of hyperparameters 17 best_params = random_search.best_params_ File /opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_search.py:891, in BaseSearchCV.fit(self, X, y, groups, **fit_params) 885 results = self._format_results( 886 all_candidate_params, n_splits, all_out, all_more_results 887 ) 889 return results --> 891 self._run_search(evaluate_candidates) 893 # multimetric is determined here because in the case of a callable 894 # self.scoring the return type is only known after calling 895 first_test_score = all_out[0]["test_scores"] File /opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_search.py:1766, in RandomizedSearchCV._run_search(self, evaluate_candidates) 1764 def _run_search(self, evaluate_candidates): 1765 """Search n_iter candidates from param_distributions""" -> 1766 evaluate_candidates( 1767 ParameterSampler( 1768 self.param_distributions, self.n_iter, random_state=self.random_state 1769 ) 1770 ) File /opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_search.py:838, in BaseSearchCV.fit.<locals>.evaluate_candidates(candidate_params, cv, more_results) 830 if self.verbose > 0: 831 print( 832 "Fitting {0} folds for each of {1} candidates," 833 " totalling {2} fits".format( 834 n_splits, n_candidates, n_candidates * n_splits 835 ) 836 ) --> 838 out = parallel( 839 delayed(_fit_and_score)( 840 clone(base_estimator), 841 X, 842 y, 843 train=train, 844 test=test, 845 parameters=parameters, 846 split_progress=(split_idx, n_splits), 847 candidate_progress=(cand_idx, n_candidates), 848 **fit_and_score_kwargs, 849 ) 850 for (cand_idx, parameters), (split_idx, (train, test)) in product( 851 enumerate(candidate_params), enumerate(cv.split(X, y, groups)) 852 ) 853 ) 855 if len(out) < 1: 856 raise ValueError( 857 "No fits were performed. " 858 "Was the CV iterator empty? " 859 "Were there no candidates?" 860 ) File /opt/anaconda3/lib/python3.9/site-packages/joblib/parallel.py:1952, in Parallel.__call__(self, iterable) 1946 # The first item from the output is blank, but it makes the interpreter 1947 # progress until it enters the Try/Except block of the generator and 1948 # reach the first `yield` statement. This starts the aynchronous 1949 # dispatch of the tasks to the workers. 1950 next(output) -> 1952 return output if self.return_generator else list(output) File /opt/anaconda3/lib/python3.9/site-packages/joblib/parallel.py:1595, in Parallel._get_outputs(self, iterator, pre_dispatch) 1592 yield 1594 with self._backend.retrieval_context(): -> 1595 yield from self._retrieve() 1597 except GeneratorExit: 1598 # The generator has been garbage collected before being fully 1599 # consumed. This aborts the remaining tasks if possible and warn 1600 # the user if necessary. 1601 self._exception = True File /opt/anaconda3/lib/python3.9/site-packages/joblib/parallel.py:1699, in Parallel._retrieve(self) 1692 while self._wait_retrieval(): 1693 1694 # If the callback thread of a worker has signaled that its task 1695 # triggered an exception, or if the retrieval loop has raised an 1696 # exception (e.g. `GeneratorExit`), exit the loop and surface the 1697 # worker traceback. 1698 if self._aborting: -> 1699 self._raise_error_fast() 1700 break 1702 # If the next job is not ready for retrieval yet, we just wait for 1703 # async callbacks to progress. File /opt/anaconda3/lib/python3.9/site-packages/joblib/parallel.py:1734, in Parallel._raise_error_fast(self) 1730 # If this error job exists, immediatly raise the error by 1731 # calling get_result. This job might not exists if abort has been 1732 # called directly or if the generator is gc'ed. 1733 if error_job is not None: -> 1734 error_job.get_result(self.timeout) File /opt/anaconda3/lib/python3.9/site-packages/joblib/parallel.py:736, in BatchCompletionCallBack.get_result(self, timeout) 730 backend = self.parallel._backend 732 if backend.supports_retrieve_callback: 733 # We assume that the result has already been retrieved by the 734 # callback thread, and is stored internally. It's just waiting to 735 # be returned. --> 736 return self._return_or_raise() 738 # For other backends, the main thread needs to run the retrieval step. 739 try: File /opt/anaconda3/lib/python3.9/site-packages/joblib/parallel.py:754, in BatchCompletionCallBack._return_or_raise(self) 752 try: 753 if self.status == TASK_ERROR: --> 754 raise self._result 755 return self._result 756 finally: ValueError: Invalid parameter mlp for estimator Pipeline(steps=[('preprocessor', ColumnTransformer(transformers=[('num', Pipeline(steps=[('imputer', SimpleImputer(strategy='median')), ('scaler', StandardScaler())]), ['installment', 'annual_inc', 'dti', 'fico_range_low', 'inq_last_6mths', 'open_acc', 'pub_rec', 'revol_bal', 'total_acc', 'pub_rec_bankruptcies', 'int_rate', 'revol_util', 'issue_d', 'earliest_cr_line'... Pipeline(steps=[('imputer', SimpleImputer(fill_value='missing', strategy='constant')), ('onehot', OneHotEncoder(handle_unknown='ignore'))]), ['term', 'grade', 'sub_grade', 'emp_title', 'emp_length', 'home_ownership', 'purpose', 'zip_code', 'addr_state', 'verification_status'])])), ('classifier', MLPClassifier(hidden_layer_sizes=(20, 10), max_iter=300, random_state=42))]). Check the list of available parameters with `estimator.get_params().keys()`.
# Fit the model with the best parameters in Gridsearch
mlp_pipeline_hpo = Pipeline(steps=[('preprocessor', preprocessor),
('classifier', MLPClassifier(hidden_layer_sizes=(20,10,),
activation='relu',
solver='sgd',
max_iter=300,
random_state=42
))])
# Train the pipeline
mlp_pipeline_hpo.fit(X_train, y_train)
Pipeline(steps=[('preprocessor',
ColumnTransformer(transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='median')),
('scaler',
StandardScaler())]),
['installment', 'annual_inc',
'dti', 'fico_range_low',
'inq_last_6mths', 'open_acc',
'pub_rec', 'revol_bal',
'total_acc',
'pub_rec_bankruptcies',
'int_rate', 'revol_util',
'issue_d',
'earliest_cr_line'...
Pipeline(steps=[('imputer',
SimpleImputer(fill_value='missing',
strategy='constant')),
('onehot',
OneHotEncoder(handle_unknown='ignore'))]),
['term', 'grade', 'sub_grade',
'emp_title', 'emp_length',
'home_ownership', 'purpose',
'zip_code', 'addr_state',
'verification_status'])])),
('classifier',
MLPClassifier(hidden_layer_sizes=(20, 10), max_iter=300,
random_state=42, solver='sgd'))])
model_evaluation(mlp_pipeline_hpo)
Training Metrics: Accuracy: 0.9228 Precision (default): 0.8629 Recall (default): 0.9135 F1 Score (default): 0.8875 AUC: 0.9732 Test Metrics: Accuracy: 0.8136 Precision (default): 0.4266 Recall (default): 0.7560 F1 Score (default): 0.5455 AUC: 0.8811
# Fit the model with the best parameters in Gridsearch
# base estimators for stacker
base_estimators = [
('gbm', GradientBoostingClassifier(n_estimators=200, learning_rate=0.1, random_state=42)),
('rf', RandomForestClassifier(n_estimators=50, min_samples_split=10, random_state=42)),
('lr', LogisticRegression(penalty = 'l2', C = 0.1, max_iter = 300, random_state = 42))
]
# final estimator on top
final_estimator = LogisticRegression(penalty = 'l2', C = 0.1, max_iter = 300, random_state = 42)
stacking_classifier = StackingClassifier(estimators = base_estimators,
final_estimator = final_estimator,
cv = 3,
n_jobs = -1
)
sc_pipeline_hpo = Pipeline(steps=[('preprocessor', preprocessor),
('classifier', stacking_classifier)])
# Train the pipeline
sc_pipeline_hpo.fit(X_train, y_train)
Pipeline(steps=[('preprocessor',
ColumnTransformer(transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='median')),
('scaler',
StandardScaler())]),
['installment', 'annual_inc',
'dti', 'fico_range_low',
'inq_last_6mths', 'open_acc',
'pub_rec', 'revol_bal',
'total_acc',
'pub_rec_bankruptcies',
'int_rate', 'revol_util',
'issue_d',
'earliest_cr_line'...
('classifier',
StackingClassifier(cv=3,
estimators=[('gbm',
GradientBoostingClassifier(n_estimators=200,
random_state=42)),
('rf',
RandomForestClassifier(min_samples_split=10,
n_estimators=50,
random_state=42)),
('lr',
LogisticRegression(C=0.1,
max_iter=300,
random_state=42))],
final_estimator=LogisticRegression(C=0.1,
max_iter=300,
random_state=42),
n_jobs=-1))])
model_evaluation(sc_pipeline_hpo)
Training Metrics: Accuracy: 0.8695 Precision (default): 0.7845 Recall (default): 0.8390 F1 Score (default): 0.8108 AUC: 0.9523 Test Metrics: Accuracy: 0.8350 Precision (default): 0.4633 Recall (default): 0.7310 F1 Score (default): 0.5672 AUC: 0.8911
models = {
'Logistic Regression': lr_pipeline_hpo,
'Random Forest': rf_pipeline_hpo,
'Gradient Boost': gbm_pipeline_hpo,
'Neural Network': mlp_pipeline_hpo,
'Stacking Classifier': sc_pipeline_hpo
}
from sklearn.preprocessing import LabelBinarizer
# Binarize labels for AUC calculation
lb = LabelBinarizer()
y_train_binarized = lb.fit_transform(y_train).ravel()
y_test_binarized = lb.transform(y_test).ravel()
FPRlevel=0.05
for model_name, model_pipeline in models.items():
# Calculate ROC
fpr, tpr, thresholds_roc = roc_curve(y_test_binarized,
model_pipeline.predict_proba(X_test)[:,1])
roc_auc = auc(fpr, tpr)
# Calculate precision and recall
precision, recall, thresholds_pr = precision_recall_curve(y_test_binarized,
model_pipeline.predict_proba(X_test)[:,1])
pr_auc = auc(recall, precision)
plt.figure(figsize=(14, 6))
# Plot ROC Curve
plt.subplot(1, 2, 1)
plt.plot(fpr, tpr, label=f'ROC curve (area = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], 'k--') # Dashed diagonal
# Highlight the 5% FPR with a vertical line
idx = next(i for i, x in enumerate(fpr) if x >= FPRlevel) # Find the index for FPR just over the preset level
plt.axvline(x=fpr[idx], color='r', linestyle='--') # Vertical line for preset FPR
plt.plot(fpr[idx], tpr[idx], 'ro') # Red dot at the intersection
# Adding a text annotation for the threshold
plt.annotate(f'Threshold={thresholds_roc[idx]:.2f}\nTPR / Recall={tpr[idx]:.2f}\n FPR = {FPRlevel}',
(fpr[idx], tpr[idx]), textcoords="offset points", xytext=(-10,10), ha='center')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title(f'Receiver Operating Characteristic (ROC) Curve - {model_name}')
plt.legend(loc="lower right")
# Plot PR Curve
plt.subplot(1, 2, 2)
plt.plot(recall, precision, label=f'PR curve (area = {pr_auc:.2f})')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title(f'Precision-Recall Curve - {model_name}')
plt.legend(loc="lower left")
plt.tight_layout()
plt.show()
FPRlevel=0.02
for model_name, model_pipeline in models.items():
# Calculate ROC
fpr, tpr, thresholds_roc = roc_curve(y_test_binarized,
model_pipeline.predict_proba(X_test)[:,1])
roc_auc = auc(fpr, tpr)
# Calculate precision and recall
precision, recall, thresholds_pr = precision_recall_curve(y_test_binarized,
model_pipeline.predict_proba(X_test)[:,1])
pr_auc = auc(recall, precision)
plt.figure(figsize=(14, 6))
# Plot ROC Curve
plt.subplot(1, 2, 1)
plt.plot(fpr, tpr, label=f'ROC curve (area = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], 'k--') # Dashed diagonal
# Highlight the 5% FPR with a vertical line
idx = next(i for i, x in enumerate(fpr) if x >= FPRlevel) # Find the index for FPR just over the preset level
plt.axvline(x=fpr[idx], color='r', linestyle='--') # Vertical line for preset FPR
plt.plot(fpr[idx], tpr[idx], 'ro') # Red dot at the intersection
# Adding a text annotation for the threshold
plt.annotate(f'Threshold={thresholds_roc[idx]:.2f}\nTPR / Recall={tpr[idx]:.2f}\n FPR = {FPRlevel}',
(fpr[idx], tpr[idx]), textcoords="offset points", xytext=(-10,10), ha='center')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title(f'Receiver Operating Characteristic (ROC) Curve - {model_name}')
plt.legend(loc="lower right")
# Plot PR Curve
plt.subplot(1, 2, 2)
plt.plot(recall, precision, label=f'PR curve (area = {pr_auc:.2f})')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title(f'Precision-Recall Curve - {model_name}')
plt.legend(loc="lower left")
plt.tight_layout()
plt.show()
# Operating table for each model
for model_name, model_pipeline in models.items():
y_scores = model_pipeline.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_scores)
# Define target FPR values
target_fpr = np.arange(0.01, 0.11, 0.01) # From 1% to 10%
# Interpolate to find TPR and threshold for target FPRs
interp_tpr = np.interp(target_fpr, fpr, tpr)
interp_thresholds = np.interp(target_fpr, fpr, thresholds)
target_fpr_df = pd.DataFrame({
'Target FPR (%)': target_fpr * 100, # Convert to percentage
'Expected TPR': interp_tpr,
'Threshold': interp_thresholds})
# Print the results
print(f"Operating Table - {model_name}:")
print('---------------------------')
print(target_fpr_df.round(4))
Operating Table - Logistic Regression: --------------------------- Target FPR (%) Expected TPR Threshold 0 1.0 0.1078 0.7584 1 2.0 0.1703 0.7170 2 3.0 0.2418 0.6796 3 4.0 0.2928 0.6532 4 5.0 0.3314 0.6312 5 6.0 0.3678 0.6118 6 7.0 0.4109 0.5944 7 8.0 0.4415 0.5792 8 9.0 0.4699 0.5640 9 10.0 0.4983 0.5507 Operating Table - Random Forest: --------------------------- Target FPR (%) Expected TPR Threshold 0 1.0 0.1180 0.6371 1 2.0 0.1986 0.5980 2 3.0 0.2350 0.5810 3 4.0 0.3087 0.5595 4 5.0 0.3553 0.5445 5 6.0 0.3995 0.5282 6 7.0 0.4415 0.5114 7 8.0 0.4677 0.4982 8 9.0 0.5301 0.4836 9 10.0 0.5607 0.4715 Operating Table - Gradient Boost: --------------------------- Target FPR (%) Expected TPR Threshold 0 1.0 0.1419 0.8130 1 2.0 0.2236 0.7798 2 3.0 0.2951 0.7544 3 4.0 0.3451 0.7362 4 5.0 0.4234 0.7132 5 6.0 0.4654 0.6937 6 7.0 0.5131 0.6732 7 8.0 0.5471 0.6569 8 9.0 0.5800 0.6398 9 10.0 0.6061 0.6248 Operating Table - Neural Network: --------------------------- Target FPR (%) Expected TPR Threshold 0 1.0 0.1271 0.9355 1 2.0 0.2111 0.9048 2 3.0 0.2622 0.8766 3 4.0 0.3110 0.8520 4 5.0 0.3587 0.8251 5 6.0 0.4154 0.7938 6 7.0 0.4484 0.7685 7 8.0 0.4915 0.7389 8 9.0 0.5335 0.7100 9 10.0 0.5687 0.6829 Operating Table - Stacking Classifier: --------------------------- Target FPR (%) Expected TPR Threshold 0 1.0 0.1419 0.8354 1 2.0 0.2247 0.8087 2 3.0 0.3019 0.7796 3 4.0 0.3734 0.7560 4 5.0 0.4313 0.7311 5 6.0 0.4711 0.7094 6 7.0 0.4949 0.6886 7 8.0 0.5471 0.6611 8 9.0 0.5880 0.6375 9 10.0 0.6186 0.6111
for model_name, model_pipeline in models.items():
from sklearn.inspection import permutation_importance
result = permutation_importance(model_pipeline, X_test, y_test,
n_repeats=10, random_state=42,
n_jobs=-1)
feature_names = numeric_features + categorical_features
feature_importances_df = pd.DataFrame({
'Feature': feature_names, # Or 'feature_names' if applicable
'Importance Mean': result.importances_mean,
'Importance Std': result.importances_std
}).sort_values(by='Importance Mean', ascending=False).reset_index(drop=True)
print(f'Feature Importance - {model_name}')
print('------------------------------------------')
print(feature_importances_df.round(4))
plt.figure(figsize=(10, 6))
sns.barplot(data=feature_importances_df, x='Importance Mean', y='Feature')
plt.title('Permutation Importance')
plt.show()
Feature Importance - Logistic Regression
------------------------------------------
Feature Importance Mean Importance Std
0 last_credit_pull_d 0.0804 0.0034
1 issue_d 0.0155 0.0022
2 annual_inc 0.0095 0.0018
3 purpose 0.0085 0.0019
4 int_rate 0.0083 0.0025
5 addr_state 0.0056 0.0015
6 inq_last_6mths 0.0053 0.0016
7 sub_grade 0.0024 0.0013
8 open_acc 0.0014 0.0011
9 emp_title 0.0012 0.0008
10 emp_length 0.0010 0.0018
11 grade 0.0009 0.0011
12 revol_bal 0.0006 0.0011
13 zip_code 0.0005 0.0016
14 pub_rec_bankruptcies 0.0005 0.0003
15 home_ownership 0.0004 0.0004
16 verification_status 0.0003 0.0006
17 pub_rec 0.0003 0.0003
18 total_acc 0.0002 0.0006
19 dti 0.0001 0.0007
20 earliest_cr_line -0.0002 0.0003
21 revol_util -0.0005 0.0007
22 installment -0.0006 0.0009
23 term -0.0007 0.0010
24 fico_range_low -0.0014 0.0011
Feature Importance - Random Forest
------------------------------------------
Feature Importance Mean Importance Std
0 last_credit_pull_d 0.0627 0.0039
1 annual_inc 0.0030 0.0015
2 issue_d 0.0027 0.0013
3 purpose 0.0023 0.0016
4 zip_code 0.0011 0.0016
5 pub_rec_bankruptcies 0.0009 0.0003
6 pub_rec 0.0008 0.0009
7 inq_last_6mths 0.0001 0.0011
8 home_ownership 0.0000 0.0010
9 emp_title -0.0002 0.0006
10 revol_bal -0.0002 0.0014
11 fico_range_low -0.0005 0.0012
12 open_acc -0.0009 0.0006
13 verification_status -0.0010 0.0011
14 revol_util -0.0010 0.0017
15 installment -0.0011 0.0018
16 term -0.0012 0.0016
17 int_rate -0.0012 0.0017
18 earliest_cr_line -0.0013 0.0011
19 total_acc -0.0016 0.0008
20 dti -0.0018 0.0014
21 addr_state -0.0024 0.0019
22 emp_length -0.0028 0.0005
23 grade -0.0029 0.0021
24 sub_grade -0.0049 0.0011
Feature Importance - Gradient Boost
------------------------------------------
Feature Importance Mean Importance Std
0 last_credit_pull_d 0.1307 0.0031
1 annual_inc 0.0094 0.0025
2 int_rate 0.0073 0.0031
3 issue_d 0.0027 0.0011
4 purpose 0.0024 0.0010
5 term 0.0017 0.0011
6 addr_state 0.0014 0.0009
7 inq_last_6mths 0.0013 0.0009
8 dti 0.0001 0.0005
9 sub_grade 0.0001 0.0002
10 emp_title 0.0001 0.0008
11 total_acc 0.0000 0.0004
12 revol_util 0.0000 0.0005
13 verification_status 0.0000 0.0000
14 pub_rec_bankruptcies 0.0000 0.0000
15 open_acc 0.0000 0.0000
16 pub_rec -0.0001 0.0001
17 fico_range_low -0.0001 0.0003
18 installment -0.0001 0.0001
19 earliest_cr_line -0.0002 0.0002
20 emp_length -0.0004 0.0004
21 home_ownership -0.0004 0.0005
22 zip_code -0.0005 0.0008
23 revol_bal -0.0010 0.0008
24 grade -0.0010 0.0003
Feature Importance - Neural Network
------------------------------------------
Feature Importance Mean Importance Std
0 last_credit_pull_d 0.1211 0.0040
1 issue_d 0.0093 0.0018
2 annual_inc 0.0080 0.0014
3 int_rate 0.0052 0.0029
4 purpose 0.0049 0.0012
5 addr_state 0.0035 0.0014
6 open_acc 0.0032 0.0015
7 verification_status 0.0031 0.0006
8 inq_last_6mths 0.0029 0.0012
9 dti 0.0015 0.0010
10 grade 0.0013 0.0011
11 total_acc 0.0011 0.0005
12 fico_range_low 0.0011 0.0017
13 sub_grade 0.0009 0.0013
14 installment 0.0008 0.0010
15 pub_rec_bankruptcies 0.0005 0.0006
16 earliest_cr_line 0.0005 0.0006
17 emp_title 0.0003 0.0016
18 pub_rec 0.0003 0.0009
19 home_ownership 0.0001 0.0008
20 revol_util -0.0002 0.0009
21 term -0.0003 0.0008
22 revol_bal -0.0004 0.0009
23 emp_length -0.0015 0.0010
24 zip_code -0.0020 0.0034
Feature Importance - Stacking Classifier
------------------------------------------
Feature Importance Mean Importance Std
0 last_credit_pull_d 0.1167 0.0026
1 annual_inc 0.0097 0.0033
2 int_rate 0.0058 0.0028
3 issue_d 0.0039 0.0014
4 purpose 0.0031 0.0012
5 addr_state 0.0015 0.0006
6 inq_last_6mths 0.0013 0.0017
7 dti 0.0008 0.0006
8 total_acc 0.0008 0.0006
9 earliest_cr_line 0.0007 0.0004
10 revol_util 0.0006 0.0005
11 open_acc 0.0006 0.0006
12 zip_code 0.0004 0.0011
13 emp_title 0.0003 0.0006
14 installment 0.0001 0.0003
15 fico_range_low 0.0001 0.0007
16 pub_rec_bankruptcies -0.0000 0.0001
17 pub_rec -0.0000 0.0002
18 emp_length -0.0001 0.0005
19 home_ownership -0.0001 0.0004
20 verification_status -0.0001 0.0005
21 grade -0.0003 0.0008
22 sub_grade -0.0003 0.0006
23 revol_bal -0.0003 0.0006
24 term -0.0007 0.0014
# !pip install -U scikit-learn
numeric_features
['installment', 'annual_inc', 'dti', 'fico_range_low', 'inq_last_6mths', 'open_acc', 'pub_rec', 'revol_bal', 'total_acc', 'pub_rec_bankruptcies', 'int_rate', 'revol_util', 'issue_d', 'earliest_cr_line', 'last_credit_pull_d']
def pdp_plot_numeric(var, sample_n):
# var = 'credit_amount'
pdp_values = pd.DataFrame(X_train[var].sort_values().sample(frac=0.1).unique(),columns=[var])
pdp_sample = X_train.sample(sample_n).drop(var, axis=1)
pdp_cross = pdp_sample.merge(pdp_values, how='cross')
pdp_cross['pred'] = gbm_pipeline_hpo.predict_proba(pdp_cross)[:,1]
plt.figure(figsize=(10, 3))
sns.lineplot(x=f"{var}", y='pred', data=pdp_cross)
plt.title(f"Partial Dependance Plot: {var}")
plt.ylabel('Predicted Probability')
plt.xticks(rotation=45)
#plt.ylim(0, 1)
plt.grid(True)
plt.show()
for var in numeric_features:
pdp_plot_numeric(var, sample_n=300)
categorical_features_eda
['term', 'grade', 'sub_grade', 'emp_length', 'home_ownership', 'purpose', 'addr_state', 'verification_status']
def pdp_plot_categorical(ax, X_train, var, sample_n, pipeline):
pdp_values = pd.DataFrame(X_train[var].sample(frac=0.1).unique(), columns=[var])
pdp_sample = X_train.drop(columns=[var]).sample(sample_n)
pdp_cross = pdp_sample.assign(key=1).merge(pdp_values.assign(key=1), on='key').drop(columns='key')
pdp_cross['pred'] = pipeline.predict_proba(pdp_cross)[:, 1]
mean_pred = pdp_cross['pred'].mean()
pdp_cross['pred'] = pdp_cross['pred'] - mean_pred
sns.barplot(ax=ax, y='pred', x=var, ci=None, data=pdp_cross, estimator=np.mean)
ax.set_title(f"Partial Dependence Plot: {var}")
ax.set_xlabel(var)
ax.set_ylabel('Predicted Probability')
ax.tick_params(axis='x', rotation=45)
ax.grid(True)
for var in categorical_features_eda:
fig, ax = plt.subplots(figsize=(8, 6))
pdp_plot_categorical(ax, X_train, var, sample_n=500, pipeline=gbm_pipeline_hpo)
plt.show()
import dalex as dx # for explanations
pipeline_explainer = dx.Explainer(gbm_pipeline_hpo, X_test, y_test)
pipeline_explainer
Preparation of a new explainer is initiated -> data : 5956 rows 25 cols -> target variable : Parameter 'y' was a pandas.Series. Converted to a numpy.ndarray. -> target variable : 5956 values -> model_class : sklearn.ensemble._gb.GradientBoostingClassifier (default) -> label : Not specified, model's class short name will be used. (default) -> predict function : <function yhat_proba_default at 0x17bbb3a60> will be used (default) -> predict function : Accepts only pandas.DataFrame, numpy.ndarray causes problems. -> predicted values : min = 0.0126, mean = 0.249, max = 0.957 -> model type : classification will be used (default) -> residual function : difference between y and yhat (default) -> residuals : min = -0.902, mean = -0.101, max = 0.982 -> model_info : package sklearn A new explainer has been created!
<dalex._explainer.object.Explainer at 0x280ead0a0>
model_performance = pipeline_explainer.model_performance("classification")
model_performance.result
| recall | precision | f1 | accuracy | auc | |
|---|---|---|---|---|---|
| GradientBoostingClassifier | 0.779796 | 0.440385 | 0.562884 | 0.820853 | 0.893645 |
# Calculate feature importance
fi = pipeline_explainer.model_parts(processes=4)
# Plot feature importance
fi.plot()
def plot_local_breakdown_interactions(top_10_tp, pipeline_explainer):
for index, row in top_10_tp.iterrows():
local_breakdown_exp = pipeline_explainer.predict_parts(
top_10_tp.iloc[index],
type='break_down_interactions',
label=f"record:{index}, prob:{row['pred_proba']:.3f}")
local_breakdown_exp.plot()
X_test['pred']= gbm_pipeline_hpo.predict(X_test)
X_test['pred_proba']= gbm_pipeline_hpo.predict_proba(X_test)[:,1]
X_test[target] = y_test
X_test.head()
| installment | annual_inc | dti | fico_range_low | inq_last_6mths | open_acc | pub_rec | revol_bal | total_acc | pub_rec_bankruptcies | ... | emp_title | emp_length | home_ownership | purpose | zip_code | addr_state | verification_status | pred | pred_proba | loan_status | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 13494 | 131.95 | 17760.0 | 20.20 | 750.0 | 0.0 | 9.0 | 0.0 | 1639.0 | 24.0 | 0.0 | ... | Fiesta Canning | < 1 year | OWN | debt_consolidation | 856xx | AZ | Source Verified | 0 | 0.051992 | 0 |
| 21759 | 323.85 | 55000.0 | 18.59 | 715.0 | 0.0 | 7.0 | 0.0 | 5571.0 | 20.0 | 0.0 | ... | The NPD Group | 3 years | RENT | home_improvement | 117xx | NY | Not Verified | 0 | 0.025936 | 0 |
| 11247 | 560.56 | 53000.0 | 22.42 | 740.0 | 1.0 | 21.0 | 0.0 | 520.0 | 59.0 | 0.0 | ... | Toyota Motor Mfg WV | 8 years | MORTGAGE | debt_consolidation | 255xx | WV | Verified | 0 | 0.024316 | 0 |
| 25028 | 189.98 | 50000.0 | 21.58 | 665.0 | 2.0 | 17.0 | 0.0 | 13594.0 | 38.0 | 0.0 | ... | Allstate Insurance | 4 years | MORTGAGE | credit_card | 945xx | CA | Not Verified | 1 | 0.754391 | 0 |
| 20440 | 232.58 | 45000.0 | 5.97 | 765.0 | 0.0 | 3.0 | 0.0 | 530.0 | 17.0 | 0.0 | ... | Mission Community Bank | 3 years | RENT | debt_consolidation | 934xx | CA | Not Verified | 1 | 0.734469 | 1 |
5 rows × 28 columns
# Top 10 True Positives (TP)
top_10_tp = (X_test
.query('loan_status == pred and loan_status == 1')
.sort_values(by='pred_proba', ascending=False)
.head(10)
.reset_index(drop=True)
)
top_10_tp
| installment | annual_inc | dti | fico_range_low | inq_last_6mths | open_acc | pub_rec | revol_bal | total_acc | pub_rec_bankruptcies | ... | emp_title | emp_length | home_ownership | purpose | zip_code | addr_state | verification_status | pred | pred_proba | loan_status | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 489.29 | 65000.0 | 21.93 | 655.0 | 6.0 | 9.0 | 0.0 | 254.0 | 30.0 | NaN | ... | Accushutters | 1 year | MORTGAGE | small_business | 330xx | FL | Not Verified | 1 | 0.956980 | 1 |
| 1 | 795.11 | 616000.0 | 3.83 | 780.0 | 5.0 | 12.0 | 0.0 | 148829.0 | 43.0 | NaN | ... | SmartProperties.org Construction | 10+ years | MORTGAGE | small_business | 328xx | FL | Not Verified | 1 | 0.950501 | 1 |
| 2 | 108.71 | 24000.0 | 2.00 | 660.0 | 0.0 | 3.0 | 0.0 | 469.0 | 7.0 | 0.0 | ... | Shetler Security Services | 2 years | RENT | small_business | 850xx | AZ | Source Verified | 1 | 0.947289 | 1 |
| 3 | 296.46 | 85000.0 | 9.94 | 690.0 | 6.0 | 12.0 | 0.0 | 7491.0 | 15.0 | 0.0 | ... | NaN | < 1 year | RENT | small_business | 916xx | CA | Verified | 1 | 0.941127 | 1 |
| 4 | 57.41 | 35000.0 | 10.94 | 705.0 | 6.0 | 7.0 | 0.0 | 10008.0 | 12.0 | NaN | ... | bay area montesorri | < 1 year | RENT | medical | 337xx | FL | Not Verified | 1 | 0.935319 | 1 |
| 5 | 255.46 | 60000.0 | 3.96 | 765.0 | 5.0 | 4.0 | 0.0 | 7576.0 | 10.0 | 0.0 | ... | NaN | 9 years | OWN | small_business | 973xx | OR | Source Verified | 1 | 0.931537 | 1 |
| 6 | 337.20 | 50000.0 | 18.77 | 735.0 | 2.0 | 2.0 | 0.0 | 517.0 | 16.0 | 0.0 | ... | Acapulco | 2 years | RENT | small_business | 917xx | CA | Source Verified | 1 | 0.923515 | 1 |
| 7 | 235.33 | 14400.0 | 3.00 | 705.0 | 2.0 | 6.0 | 0.0 | 3449.0 | 6.0 | 0.0 | ... | NaN | 2 years | RENT | other | 937xx | CA | Not Verified | 1 | 0.923440 | 1 |
| 8 | 196.18 | 7000.0 | 8.57 | 680.0 | 2.0 | 1.0 | 0.0 | 0.0 | 2.0 | NaN | ... | UWF Parking Services | < 1 year | RENT | house | 325xx | FL | Not Verified | 1 | 0.921421 | 1 |
| 9 | 268.95 | 13000.0 | 0.00 | 715.0 | 3.0 | 5.0 | 0.0 | 0.0 | 5.0 | 0.0 | ... | Mainstay Business Solutions | < 1 year | RENT | educational | 933xx | CA | Not Verified | 1 | 0.917564 | 1 |
10 rows × 28 columns
for index, row in top_10_tp.iterrows():
local_breakdown_exp = pipeline_explainer.predict_parts(
top_10_tp.iloc[index],
type='break_down',
label=f"record:{index}, prob:{row['pred_proba']:.3f}")
local_breakdown_exp.plot()
# Top 10 False Positives (FP)
top_10_fp = (X_test
.query('loan_status != pred and loan_status == 0')
.sort_values(by='pred_proba', ascending=False)
.head(10)
.reset_index(drop=True)
)
top_10_fp
| installment | annual_inc | dti | fico_range_low | inq_last_6mths | open_acc | pub_rec | revol_bal | total_acc | pub_rec_bankruptcies | ... | emp_title | emp_length | home_ownership | purpose | zip_code | addr_state | verification_status | pred | pred_proba | loan_status | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 25.80 | 6000.0 | 0.00 | 680.0 | 1.0 | 5.0 | 0.0 | 0.0 | 5.0 | NaN | ... | NaN | < 1 year | RENT | debt_consolidation | 302xx | GA | Not Verified | 1 | 0.902170 | 0 |
| 1 | 207.22 | 20000.0 | 18.18 | 690.0 | 0.0 | 2.0 | 0.0 | 1315.0 | 11.0 | 0.0 | ... | home depot | 3 years | RENT | debt_consolidation | 935xx | CA | Verified | 1 | 0.899252 | 0 |
| 2 | 46.91 | 17000.0 | 20.40 | 670.0 | 6.0 | 8.0 | 0.0 | 3368.0 | 8.0 | 0.0 | ... | University of Minnesota | 2 years | RENT | major_purchase | 557xx | MN | Not Verified | 1 | 0.897506 | 0 |
| 3 | 69.32 | 49000.0 | 22.10 | 660.0 | 2.0 | 15.0 | 0.0 | 8158.0 | 34.0 | 0.0 | ... | Michael Enterprises | 10+ years | MORTGAGE | small_business | 497xx | MI | Not Verified | 1 | 0.896989 | 0 |
| 4 | 133.81 | 36000.0 | 7.17 | 670.0 | 0.0 | 3.0 | 0.0 | 3471.0 | 9.0 | 0.0 | ... | bed bath and beyond | 3 years | RENT | other | 070xx | NJ | Verified | 1 | 0.888755 | 0 |
| 5 | 34.59 | 18000.0 | 0.00 | 710.0 | 2.0 | 11.0 | 0.0 | 0.0 | 11.0 | 0.0 | ... | Aurora Multimedia | < 1 year | RENT | moving | 088xx | NJ | Source Verified | 1 | 0.884852 | 0 |
| 6 | 186.46 | 36000.0 | 16.03 | 650.0 | 4.0 | 7.0 | 0.0 | 3132.0 | 20.0 | NaN | ... | Wisconsin Business Development Finance Corpora... | < 1 year | RENT | debt_consolidation | 532xx | WI | Not Verified | 1 | 0.883408 | 0 |
| 7 | 437.92 | 42000.0 | 9.03 | 715.0 | 1.0 | 6.0 | 0.0 | 13399.0 | 12.0 | 0.0 | ... | Riverwind Casino | 5 years | MORTGAGE | small_business | 731xx | OK | Verified | 1 | 0.883073 | 0 |
| 8 | 139.17 | 10800.0 | 0.00 | 745.0 | 2.0 | 3.0 | 0.0 | 0.0 | 10.0 | 0.0 | ... | Wells Fargo | < 1 year | RENT | moving | 941xx | CA | Not Verified | 1 | 0.882881 | 0 |
| 9 | 268.95 | 81600.0 | 4.97 | 785.0 | 3.0 | 4.0 | 0.0 | 11.0 | 26.0 | 0.0 | ... | NaN | 5 years | RENT | small_business | 605xx | IL | Verified | 1 | 0.879306 | 0 |
10 rows × 28 columns
for index, row in top_10_fp.iterrows():
local_breakdown_exp = pipeline_explainer.predict_parts(
top_10_fp.iloc[index],
type='break_down',
label=f"record:{index}, prob:{row['pred_proba']:.3f}")
local_breakdown_exp.plot()
# Top 10 False Negatives (FN)
top_10_fn = (X_test
.query('loan_status != pred and loan_status == 1')
.sort_values(by='pred_proba', ascending=True)
.head(10)
.reset_index(drop=True)
)
top_10_fn
| installment | annual_inc | dti | fico_range_low | inq_last_6mths | open_acc | pub_rec | revol_bal | total_acc | pub_rec_bankruptcies | ... | emp_title | emp_length | home_ownership | purpose | zip_code | addr_state | verification_status | pred | pred_proba | loan_status | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 701.48 | 98000.0 | 20.85 | 780.0 | 0.0 | 20.0 | 0.0 | 12575.0 | 38.0 | 0.0 | ... | East Windsor Regional School District | 10+ years | MORTGAGE | debt_consolidation | 085xx | NJ | Verified | 0 | 0.017775 | 1 |
| 1 | 311.02 | 59000.0 | 9.82 | 780.0 | 0.0 | 4.0 | 0.0 | 83.0 | 16.0 | 0.0 | ... | csc | 10+ years | OWN | major_purchase | 890xx | NV | Verified | 0 | 0.021984 | 1 |
| 2 | 247.29 | 45500.0 | 18.78 | 680.0 | 0.0 | 16.0 | 0.0 | 10933.0 | 33.0 | 0.0 | ... | GEICO | 5 years | MORTGAGE | credit_card | 142xx | NY | Not Verified | 0 | 0.023312 | 1 |
| 3 | 164.86 | 37000.0 | 19.20 | 670.0 | 0.0 | 10.0 | 0.0 | 5505.0 | 15.0 | 0.0 | ... | Macys | < 1 year | RENT | credit_card | 236xx | VA | Verified | 0 | 0.023676 | 1 |
| 4 | 233.06 | 150000.0 | 8.60 | 695.0 | 7.0 | 6.0 | 0.0 | 21293.0 | 9.0 | 0.0 | ... | The Perfect Body, Inc. | 6 years | OWN | credit_card | 327xx | FL | Not Verified | 0 | 0.024803 | 1 |
| 5 | 152.17 | 120000.0 | 17.03 | 720.0 | 0.0 | 14.0 | 0.0 | 19237.0 | 23.0 | 0.0 | ... | ARINC | 9 years | MORTGAGE | home_improvement | 741xx | OK | Not Verified | 0 | 0.024882 | 1 |
| 6 | 154.71 | 54000.0 | 10.71 | 765.0 | 1.0 | 15.0 | 0.0 | 3371.0 | 29.0 | 0.0 | ... | polk county school board | 10+ years | MORTGAGE | home_improvement | 338xx | FL | Not Verified | 0 | 0.025856 | 1 |
| 7 | 252.93 | 110656.0 | 7.82 | 670.0 | 0.0 | 13.0 | 0.0 | 9869.0 | 34.0 | 0.0 | ... | OnLive Inc | 3 years | MORTGAGE | major_purchase | 940xx | CA | Source Verified | 0 | 0.026989 | 1 |
| 8 | 94.82 | 42120.0 | 15.04 | 670.0 | 0.0 | 5.0 | 1.0 | 3266.0 | 16.0 | 1.0 | ... | St Johns med center | 3 years | RENT | moving | 930xx | CA | Not Verified | 0 | 0.027312 | 1 |
| 9 | 104.75 | 84000.0 | 24.06 | 715.0 | 2.0 | 5.0 | 0.0 | 422.0 | 20.0 | 0.0 | ... | Brawley Insurance Services | 3 years | RENT | major_purchase | 937xx | CA | Source Verified | 0 | 0.027315 | 1 |
10 rows × 28 columns
for index, row in top_10_fn.iterrows():
local_breakdown_exp = pipeline_explainer.predict_parts(
top_10_fn.iloc[index],
type='break_down',
label=f"record:{index}, prob:{row['pred_proba']:.3f}")
local_breakdown_exp.plot()
# Load the dataset
df_ho = pd.read_csv('/Users/helenas/Desktop/Machine Learning/Project/Final_project_20240308/loan_holdout.csv')
df_ho
| id | member_id | loan_amnt | funded_amnt | funded_amnt_inv | term | int_rate | installment | grade | sub_grade | ... | next_pymnt_d | last_credit_pull_d | collections_12_mths_ex_med | policy_code | application_type | acc_now_delinq | chargeoff_within_12_mths | delinq_amnt | pub_rec_bankruptcies | tax_liens | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1077175 | 1313524 | 2400 | 2400 | 2400.0 | 36 months | 15.96% | 84.33 | C | C5 | ... | NaN | Sep-2016 | 0.0 | 1 | INDIVIDUAL | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 1 | 1075358 | 1311748 | 3000 | 3000 | 3000.0 | 60 months | 12.69% | 67.79 | B | B5 | ... | Oct-2016 | Sep-2016 | 0.0 | 1 | INDIVIDUAL | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 2 | 1075269 | 1311441 | 5000 | 5000 | 5000.0 | 36 months | 7.90% | 156.46 | A | A4 | ... | NaN | Jan-2016 | 0.0 | 1 | INDIVIDUAL | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 3 | 1071570 | 1306721 | 5375 | 5375 | 5350.0 | 60 months | 12.69% | 121.45 | B | B5 | ... | NaN | Sep-2016 | 0.0 | 1 | INDIVIDUAL | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 4 | 1064687 | 1298717 | 9000 | 9000 | 9000.0 | 36 months | 13.49% | 305.38 | C | C1 | ... | NaN | Sep-2016 | 0.0 | 1 | INDIVIDUAL | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 12756 | 88637 | 88629 | 6000 | 6000 | 650.0 | 36 months | 10.59% | 195.28 | C | C2 | ... | Jul-2010 | Oct-2014 | NaN | 1 | INDIVIDUAL | 0.0 | NaN | 0.0 | NaN | NaN |
| 12757 | 85961 | 85923 | 1200 | 1200 | 500.0 | 36 months | 9.01% | 38.17 | B | B2 | ... | Jul-2010 | Sep-2016 | NaN | 1 | INDIVIDUAL | NaN | NaN | NaN | NaN | NaN |
| 12758 | 83979 | 83974 | 3000 | 3000 | 250.0 | 36 months | 7.43% | 93.23 | A | A2 | ... | Jan-2008 | Jun-2007 | NaN | 1 | INDIVIDUAL | NaN | NaN | NaN | NaN | NaN |
| 12759 | 77757 | 70626 | 3000 | 3000 | 0.0 | 36 months | 9.33% | 95.86 | B | B3 | ... | Jul-2010 | May-2007 | NaN | 1 | INDIVIDUAL | NaN | NaN | NaN | NaN | NaN |
| 12760 | 70686 | 70681 | 5000 | 5000 | 0.0 | 36 months | 7.75% | 156.11 | A | A3 | ... | Jul-2010 | Feb-2015 | NaN | 1 | INDIVIDUAL | NaN | NaN | NaN | NaN | NaN |
12761 rows × 51 columns
# Transform rates into numeric
df_ho['int_rate'] = df_ho['int_rate'].str.strip('%').astype('float64')
df_ho['revol_util'] = df_ho['revol_util'].str.strip('%').astype('float64')
# Calculate the count of months from each date up to Dec-2023
def count_month(feature):
dates_datetime = pd.to_datetime(df_ho[feature], format='%b-%Y', errors='coerce')
end_date = pd.Timestamp('Dec-2023')
df_ho[feature] = ((end_date.year - dates_datetime.dt.year) * 12 + (end_date.month - dates_datetime.dt.month))
for i in date_cat_to_num:
count_month(i)
holdout = df_ho[feature_names]
# Predict on holdout set
holdout_proba = gbm_pipeline_hpo.predict_proba(holdout)[:,1]
# Apply threshold for 5% FPR
holdout_pred = np.where(holdout_proba > 0.7132, 1, 0)
df_ho_sub = pd.DataFrame({
"ID": df_ho["id"],
"P_DEFAULT": holdout_pred
})
df_ho_sub['P_DEFAULT'].value_counts()
0 11416 1 1345 Name: P_DEFAULT, dtype: int64
df_ho_sub = pd.DataFrame({
"ID": df_ho["id"],
"P_DEFAULT": holdout_proba
})
df_ho_sub.to_csv('/Users/helenas/Desktop/Machine Learning/Project/Final_project_20240308/loan_holdout_pred.csv')